Image source: ftp://public.dhe.ibm.com/software/analytics/spss/documentation/modeler/18.0/en/ModelerCRISPDM.pdf
The following Airbnb activity is included in this dataset:
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import sweetviz as sv
from io import StringIO
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
!kaggle datasets download -d airbnb/seattle
!kaggle datasets download -d airbnb/boston
!unzip seattle.zip
!unzip boston.zip
def flag_city(df, city):
    """Return a copy of ``df`` with a ``city`` column holding ``city`` on every row.

    Args:
        df: DataFrame to tag. It is not modified.
        city: Value written to the ``city`` column. An existing ``city``
            column is replaced.

    Returns:
        A new DataFrame with the ``city`` column added (or replaced).
    """
    # assign() builds a new frame, so the caller's DataFrame keeps its columns.
    return df.assign(city=city)
def concat(df1, df2, *, ignore_index=False):
    """Stack ``df2`` below ``df1`` using only ``df1``'s columns, in ``df1``'s order.

    Args:
        df1: Frame whose columns define the schema of the result.
        df2: Frame appended after ``df1``. It must contain every column of
            ``df1``; any additional columns are dropped.
        ignore_index: When True, the result gets a fresh 0..n-1 index instead
            of repeating the row labels of both inputs. Defaults to False,
            which keeps the original labels.

    Returns:
        A new DataFrame with the rows of ``df1`` followed by the rows of ``df2``.

    Raises:
        KeyError: If ``df2`` is missing one of ``df1``'s columns.
    """
    shared_columns = list(df1.columns)
    return pd.concat(
        [df1[shared_columns], df2[shared_columns]],
        ignore_index=ignore_index,
    )
def to_date(df, columns):
    """Return a copy of ``df`` with the given columns rewritten as ``YYYY-MM-DD`` strings.

    Each column is parsed with ``pd.to_datetime`` and formatted back to text,
    so any time-of-day component is dropped. Values that parse to NaT come
    out as missing values.

    Args:
        df: Source DataFrame. It is not modified.
        columns: Column name, or iterable of column names, to convert.

    Returns:
        A new DataFrame with the converted columns.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(columns, str):
        columns = [columns]

    out = df.copy()
    for col in columns:
        out[col] = pd.to_datetime(out[col]).dt.strftime('%Y-%m-%d')
    return out
def price_to_numeric(df, columns):
    """Return a copy of ``df`` with currency-formatted text columns parsed as numbers.

    ``$`` signs and thousands separators (``,``) are stripped before the
    column is passed to ``pd.to_numeric``; for example ``"$1,200.00"``
    becomes ``1200.0``. Missing values stay missing.

    Args:
        df: Source DataFrame. It is not modified.
        columns: Column name, or iterable of column names, to convert.

    Returns:
        A new DataFrame with the converted columns.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(columns, str):
        columns = [columns]

    out = df.copy()
    for col in columns:
        # Already-numeric columns (a re-run cell, or an all-empty CSV column
        # read as float) have no .str accessor and need no cleaning.
        if pd.api.types.is_numeric_dtype(out[col]):
            continue
        # regex=False: '$' must match the literal character, not end-of-string.
        cleaned = (
            out[col]
            .str.replace('$', '', regex=False)
            .str.replace(',', '', regex=False)
        )
        out[col] = pd.to_numeric(cleaned)
    return out
# Read identifier columns as strings so they are treated as labels, not numbers.
calendar_dtype = {'listing_id': str}
listing_dtype = {"id": str, "host_id": str}
reviews_dtype = {"id": str, 'listing_id': str, 'reviewer_id': str}

# Load each city's three tables and tag every row with its city.
seattle_calendar = flag_city(pd.read_csv("./seattle/calendar.csv", dtype=calendar_dtype), "Seattle")
seattle_listings = flag_city(pd.read_csv("./seattle/listings.csv", dtype=listing_dtype), "Seattle")
seattle_reviews = flag_city(pd.read_csv("./seattle/reviews.csv", dtype=reviews_dtype), "Seattle")
boston_calendar = flag_city(pd.read_csv("./boston/calendar.csv", dtype=calendar_dtype), "Boston")
boston_listings = flag_city(pd.read_csv("./boston/listings.csv", dtype=listing_dtype), "Boston")
boston_reviews = flag_city(pd.read_csv("./boston/reviews.csv", dtype=reviews_dtype), "Boston")

# Stack the two cities. ignore_index gives each combined table unique row
# labels instead of repeating 0..n-1 once per city.
calendar = pd.concat([seattle_calendar, boston_calendar], ignore_index=True)
calendar = to_date(calendar, ["date"])
calendar = price_to_numeric(calendar, ["price"])

listings = pd.concat([seattle_listings, boston_listings], ignore_index=True)
listings = to_date(listings, ["host_since", "first_review", "last_review"])
listings = price_to_numeric(listings, ["price", "weekly_price", "monthly_price", "cleaning_fee"])

# Replace the raw 't' / 'f' flags with readable labels; any other value
# (including a missing one) becomes a missing value.
listings["host_is_superhost"] = listings["host_is_superhost"].map(
    {'t': 'Superhost', 'f': 'Non-Superhost'}
)

reviews = pd.concat([seattle_reviews, boston_reviews], ignore_index=True)
reviews = to_date(reviews, ["date"])
# Print the same five headline figures for each city, Seattle first.
for city in ("Seattle", "Boston"):
    city_calendar = calendar[calendar["city"] == city]
    city_listings = listings[listings["city"] == city]
    observation_count = len(city_calendar)
    first_date = str(city_calendar["date"].min())
    last_date = str(city_calendar["date"].max())
    host_count = len(city_listings["host_id"].unique())
    listing_count = len(city_listings["id"].unique())
    print(f"""
{city} Data
Number of observation: {observation_count} records
Fist observation: {first_date}
Last observation: {last_date}
Total Host: {host_count}
Total Listing: {listing_count}
""")
Seattle Data
Number of observation: 1393570 records
Fist observation: 2016-01-04
Last observation: 2017-01-02
Total Host: 2751
Total Listing: 3818
Boston Data
Number of observation: 1308890 records
Fist observation: 2016-09-06
Last observation: 2017-09-05
Total Host: 2181
Total Listing: 3585
# Build one sweetviz profiling report per table and open each in the browser.
for report_name, frame in (
    ("calendar", calendar),
    ("listings", listings),
    ("reviews", reviews),
):
    analyze_report = sv.analyze(frame)
    analyze_report.show_html(f"{report_name}.html", open_browser=True)
# Summary statistics of the price column for Boston rows of `df`.
# NOTE(review): in the part of the file visible here, `df` is first assigned
# further down, so this line raises NameError on a fresh top-to-bottom run.
# The output recorded below presumably came from a `df` created by a cell run
# earlier in the session - confirm which filtered frame was intended.
df[df["city"]=="Boston"]["price"].describe()
count 2748.000000 mean 167.188137 std 113.097098 min 11.000000 25% 85.000000 50% 146.000000 75% 215.000000 max 1300.000000 Name: price, dtype: float64
# Summary statistics of the price column for Seattle rows of `df`.
# NOTE(review): in the part of the file visible here, `df` is first assigned
# further down, so this line raises NameError on a fresh top-to-bottom run.
# The output recorded below presumably came from a `df` created by a cell run
# earlier in the session - confirm which filtered frame was intended.
df[df["city"]=="Seattle"]["price"].describe()
count 3166.000000 mean 126.844599 std 90.068497 min 22.000000 25% 75.000000 50% 100.000000 75% 150.000000 max 1000.000000 Name: price, dtype: float64
import plotly.figure_factory as ff

# Compare the price distribution of the two cities, ignoring listings priced
# at 1500 or more.
df = listings[listings["price"] < 1500]
group_labels = ['Boston', 'Seattle']
hist_data = [df.loc[df["city"] == label, "price"] for label in group_labels]

# Curves only, no histogram bars; curve_type is not passed, so plotly's
# default applies.
fig = ff.create_distplot(hist_data, group_labels, show_hist=False)
fig.update_layout(
    title_text='Listing Price Distribution',
    template='simple_white',
    xaxis_title="Price",
)
fig.show()
# Horizontal box plot of price per property type, one box per city, limited
# to the six property types listed here.
majority_type = ["House","Apartment","Condominium","Townhouse", "Bed & Breakfast","Loft"]
common_type_listings = listings[listings["property_type"].isin(majority_type)]

fig = px.box(
    common_type_listings,
    x="price",
    y="property_type",
    color="city",
    orientation='h',
)
fig.update_layout(
    title_text='Price by Type',
    template='simple_white',
    yaxis_title="Type",
    xaxis_title="Price",
    showlegend=True,
    # Clip the visible price axis at 800.
    xaxis_range=[0,800],
)
fig.show()
import plotly.express as px
from plotly.subplots import make_subplots


def _property_type_counts(listings, city):
    """Return listing count (``id``) and mean ``price`` per property type for ``city``, fewest first."""
    return (
        listings.loc[listings["city"] == city, ["city", "property_type", "id", "price"]]
        .groupby(['city', 'property_type'])
        .agg({'id': 'size', 'price': 'mean'})
        .reset_index()
        .sort_values(by='id', ascending=True)
    )


def _property_type_count_bar(counts, title):
    """Build a horizontal bar chart: bar length is listing count, colour and label are mean price."""
    return px.bar(
        counts,
        y="property_type", x="id", color="price", text="price",
        orientation='h',
    ).update_layout(
        title_text=title, template='simple_white',
        yaxis={'categoryorder': 'total ascending'},
        yaxis_title="Property Type",
        xaxis_title="Number of Listing",
    ).update_traces(texttemplate='%{text:.2s}')


# One standalone chart per city.
df_seattle = _property_type_counts(listings, "Seattle")
fig_seattle = _property_type_count_bar(df_seattle, 'Seattle Property Type')
df_boston = _property_type_counts(listings, "Boston")
fig_boston = _property_type_count_bar(df_boston, 'Boston Property Type')

# Place the two charts side by side: Seattle in column 1, Boston in column 2.
# Only the traces are carried over; the layout is set again on the combined figure.
fig_subplot = make_subplots(rows=1, cols=2, subplot_titles=('Seattle', 'Boston'))
for col, city_fig in enumerate((fig_seattle, fig_boston), start=1):
    for trace in city_fig.data:
        fig_subplot.add_trace(trace, row=1, col=col)

fig_subplot.update_layout(
    title_text='Property Type (color indicate price)', template='simple_white',
    yaxis_title="Property Type",
    showlegend=True,
).update_traces(texttemplate='%{text:.2s}')
# Same x-axis title on both subplots.
fig_subplot.update_xaxes(title_text='Number of Listing')
fig_subplot.show()
import plotly.figure_factory as ff

# Compare the overall-rating distribution of the two cities, keeping only
# listings that have a rating and whose rating is above 50.
rating = listings["review_scores_rating"]
df = listings[rating.notnull() & (rating > 50)]
group_labels = ['Boston', 'Seattle']
hist_data = [df.loc[df["city"] == label, "review_scores_rating"] for label in group_labels]

# Curves only, no histogram bars; curve_type is not passed, so plotly's
# default applies.
fig = ff.create_distplot(hist_data, group_labels, show_hist=False)
fig.update_layout(
    title_text='Overall Rating Distribution',
    template='simple_white',
    xaxis_title="Review Score",
)
fig.show()
import plotly.express as px
from plotly.subplots import make_subplots


def _rating_by_property_type(listings, city):
    """Return mean rating, review total and listing count per property type for ``city``.

    Rows are sorted by mean rating (lowest first). Property types without any
    rating are dropped for every city, so both subplots follow the same rule.
    """
    summary = (
        listings.loc[
            listings["city"] == city,
            ["city", "property_type", "id", "review_scores_rating", "number_of_reviews"],
        ]
        .groupby(['city', 'property_type'])
        .agg({'id': 'size', 'review_scores_rating': 'mean', "number_of_reviews": "sum"})
        .reset_index()
        .sort_values(by='review_scores_rating', ascending=True)
    )
    return summary[summary["review_scores_rating"].notnull()]


def _rating_by_property_type_bar(summary, title):
    """Build a horizontal bar chart: bar length and label are mean rating, colour is review total."""
    return px.bar(
        summary,
        y="property_type", x="review_scores_rating",
        color="number_of_reviews", text="review_scores_rating",
        orientation='h',
    ).update_layout(
        title_text=title, template='simple_white',
        yaxis={'categoryorder': 'total ascending'},
        yaxis_title="Property Type",
        xaxis_title="Average Rating",
    ).update_traces(texttemplate='%{text:.2s}')


# One standalone chart per city.
df_seattle = _rating_by_property_type(listings, "Seattle")
fig_seattle = _rating_by_property_type_bar(df_seattle, 'Seattle Property Type')
df_boston = _rating_by_property_type(listings, "Boston")
fig_boston = _rating_by_property_type_bar(df_boston, 'Boston Property Type')

# Place the two charts side by side: Seattle in column 1, Boston in column 2.
# Only the traces are carried over; the layout is set again on the combined figure.
fig_subplot = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Seattle', 'Boston'),
    horizontal_spacing=0.15,
)
for col, city_fig in enumerate((fig_seattle, fig_boston), start=1):
    for trace in city_fig.data:
        fig_subplot.add_trace(trace, row=1, col=col)

fig_subplot.update_layout(
    title_text='Review by Type (color indicate number of review)', template='simple_white',
    yaxis_title="Property Type",
    showlegend=True,
)
# Same x-axis title on both subplots.
fig_subplot.update_xaxes(title_text='Average Rating')
fig_subplot.show()
# Rank the numeric listing attributes by the strength of their correlation
# with the overall review score, for Seattle.
# Only numeric/boolean columns are correlated: DataFrame.corr() raises on
# string columns in pandas >= 2.0, where they are no longer skipped silently.
seattle_numeric = listings[listings["city"]=="Seattle"].select_dtypes(include=["number", "bool"])
correlation = seattle_numeric.corr().reset_index()
# Sort by magnitude so strong negative correlations rank next to strong positive ones.
correlation["absolute_corr"] = correlation["review_scores_rating"].abs().fillna(0)
correlation = (
    correlation.sort_values(by=['absolute_corr'], ascending=False)
    [["index","review_scores_rating"]]
    .reset_index(drop=True)
    .rename(columns={"index": "factor"})
)
# Keep ranks 2..15 (label slicing is inclusive, so 14 rows). Row 0 is
# presumably review_scores_rating correlated with itself.
# NOTE(review): row 1 is skipped as well - confirm that is intended.
correlation = correlation.loc[2:15,:].reset_index(drop=True)
correlation.index += 1
correlation
| | factor | review_scores_rating |
|---|---|---|
| 1 | review_scores_cleanliness | 0.642882 |
| 2 | review_scores_accuracy | 0.621257 |
| 3 | review_scores_communication | 0.540620 |
| 4 | review_scores_checkin | 0.521813 |
| 5 | review_scores_location | 0.368423 |
| 6 | calculated_host_listings_count | -0.219280 |
| 7 | square_feet | 0.143793 |
| 8 | host_listings_count | -0.109357 |
| 9 | host_total_listings_count | -0.109357 |
| 10 | reviews_per_month | 0.087313 |
| 11 | price | 0.055551 |
| 12 | availability_30 | -0.051439 |
| 13 | monthly_price | 0.048595 |
| 14 | availability_60 | -0.048460 |
# Rank the numeric listing attributes by the strength of their correlation
# with the overall review score, for Boston.
# Only numeric/boolean columns are correlated: DataFrame.corr() raises on
# string columns in pandas >= 2.0, where they are no longer skipped silently.
boston_numeric = listings[listings["city"]=="Boston"].select_dtypes(include=["number", "bool"])
correlation = boston_numeric.corr().reset_index()
# Sort by magnitude so strong negative correlations rank next to strong positive ones.
correlation["absolute_corr"] = correlation["review_scores_rating"].abs().fillna(0)
correlation = (
    correlation.sort_values(by=['absolute_corr'], ascending=False)
    [["index","review_scores_rating"]]
    .reset_index(drop=True)
    .rename(columns={"index": "factor"})
)
# Keep ranks 2..15 (label slicing is inclusive, so 14 rows). Row 0 is
# presumably review_scores_rating correlated with itself.
# NOTE(review): row 1 is skipped as well - confirm that is intended.
correlation = correlation.loc[2:15,:].reset_index(drop=True)
correlation.index += 1
correlation
| | factor | review_scores_rating |
|---|---|---|
| 1 | review_scores_cleanliness | 0.754327 |
| 2 | review_scores_accuracy | 0.705104 |
| 3 | review_scores_communication | 0.600599 |
| 4 | review_scores_checkin | 0.584034 |
| 5 | review_scores_location | 0.458901 |
| 6 | square_feet | -0.175922 |
| 7 | availability_30 | -0.147676 |
| 8 | calculated_host_listings_count | -0.142220 |
| 9 | availability_60 | -0.138288 |
| 10 | host_listings_count | -0.125055 |
| 11 | host_total_listings_count | -0.125055 |
| 12 | availability_90 | -0.123666 |
| 13 | weekly_price | 0.109238 |
| 14 | price | 0.105651 |
# Scatter of price against overall rating, coloured by city; the visible
# price axis is limited to 0-1000.
fig = px.scatter(
    data_frame=listings,
    x="review_scores_rating",
    y="price",
    color="city",
).update_layout(yaxis_range=[0,1000])
fig.show()
import plotly.express as px
from plotly.subplots import make_subplots


def _top_rated_neighbourhoods(listings, city, top_n=20):
    """Return the ``top_n`` neighbourhoods of ``city`` with the highest mean rating.

    Neighbourhoods without any rating are dropped. Rows are sorted by mean
    rating in ascending order, so the best-rated neighbourhood is last.
    """
    summary = (
        listings.loc[
            listings["city"] == city,
            ["city", "neighbourhood_cleansed", "review_scores_rating", "number_of_reviews"],
        ]
        .groupby(['city', 'neighbourhood_cleansed'])
        .agg({'review_scores_rating': 'mean', "number_of_reviews": "sum"})
        .reset_index()
    )
    return (
        summary[summary["review_scores_rating"].notnull()]
        .sort_values(by='review_scores_rating', ascending=True)
        .tail(top_n)
    )


def _neighbourhood_rating_bar(summary, title):
    """Build a horizontal bar chart: bar length and label are mean rating, colour is review total."""
    return px.bar(
        summary,
        y="neighbourhood_cleansed", x="review_scores_rating",
        color="number_of_reviews", text="review_scores_rating",
        orientation='h',
    ).update_layout(
        title_text=title, template='simple_white',
        yaxis={'categoryorder': 'total ascending'},
        yaxis_title="Neighbourhood",
        xaxis_title="Average Rating",
    ).update_traces(texttemplate='%{text:.2s}')


# One standalone chart per city.
df_seattle = _top_rated_neighbourhoods(listings, "Seattle")
fig_seattle = _neighbourhood_rating_bar(df_seattle, 'Seattle Neighbourhoods')
df_boston = _top_rated_neighbourhoods(listings, "Boston")
fig_boston = _neighbourhood_rating_bar(df_boston, 'Boston Neighbourhoods')

# Place the two charts side by side: Seattle in column 1, Boston in column 2.
# Only the traces are carried over; the layout is set again on the combined figure.
# The wider horizontal spacing leaves room for the neighbourhood names.
fig_subplot = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Seattle', 'Boston'),
    horizontal_spacing=0.22,
)
for col, city_fig in enumerate((fig_seattle, fig_boston), start=1):
    for trace in city_fig.data:
        fig_subplot.add_trace(trace, row=1, col=col)

fig_subplot.update_layout(
    title_text='Top 20 Neighbourhoods by Average Rating (color indicates number of reviews)',
    template='simple_white',
    yaxis_title="Neighbourhood",
    showlegend=True,
)
# Same x-axis title on both subplots.
fig_subplot.update_xaxes(title_text='Average Rating')
fig_subplot.show()
majority_type = ["House","Apartment","Condominium","Townhouse", "Bed & Breakfast","Loft"]
# Fix the category order so plotly express assigns the same colour to the
# same host category in both cities, whichever value appears first in the data.
superhost_order = {"host_is_superhost": ["Superhost", "Non-Superhost"]}


def _superhost_listings(listings, city):
    """Return ``city`` listings that have a known superhost flag and a price below 800."""
    return listings[
        (listings["host_is_superhost"].notnull()) &
        (listings["city"] == city) &
        (listings["price"] < 800)
    ]


def _superhost_price_box(city_listings):
    """Build a horizontal box plot of price per common property type, split by host category."""
    return px.box(
        city_listings[city_listings["property_type"].isin(majority_type)],
        y="property_type", x="price", color="host_is_superhost",
        orientation='h',
        category_orders=superhost_order,
    ).update_layout(
        title_text='Price by Type', template='simple_white',
        yaxis_title="Type",
        xaxis_title="Price",
    )


# One standalone chart per city.
df_seattle = _superhost_listings(listings, "Seattle")
fig_seattle = _superhost_price_box(df_seattle)
df_boston = _superhost_listings(listings, "Boston")
fig_boston = _superhost_price_box(df_boston)

# Place the two charts side by side: Seattle in column 1, Boston in column 2.
fig_subplot = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Seattle', 'Boston'),
    horizontal_spacing=0.15,
)
legend_names_seen = set()
for col, city_fig in enumerate((fig_seattle, fig_boston), start=1):
    for trace in city_fig.data:
        # Show each host category once in the legend rather than once per city.
        trace.showlegend = trace.name not in legend_names_seen
        legend_names_seen.add(trace.name)
        fig_subplot.add_trace(trace, row=1, col=col)

fig_subplot.update_layout(
    title_text='Price Superhost vs Non-Superhost', template='simple_white',
    yaxis_title="Type",
    # Draw the two host categories side by side within each property type.
    boxmode='group',
    legend=dict(orientation="h"),
)
# Same x-axis title on both subplots.
fig_subplot.update_xaxes(title_text='Price')
fig_subplot.show()
# Descriptive statistics of every review-score column for Seattle listings,
# with Non-Superhost and Superhost shown side by side.
score_columns = [
    'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
    'review_scores_checkin', 'review_scores_communication',
    'review_scores_location', 'review_scores_value',
]
df = listings[listings["city"]=="Seattle"]
superhost_scores = df[['host_is_superhost', *score_columns]]
# Transpose and unstack so each score is a row and each statistic a column.
superhost_scores.groupby(['host_is_superhost']).describe().T.unstack(1)
| host_is_superhost | Non-Superhost | | | | | | | | Superhost | | | | | | | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| | count | mean | std | min | 25% | 50% | 75% | max | count | mean | std | min | 25% | 50% | 75% | max |
| review_scores_rating | 2420.0 | 93.652893 | 7.192689 | 20.0 | 91.0 | 95.0 | 100.0 | 100.0 | 751.0 | 97.395473 | 2.624384 | 60.0 | 96.0 | 98.0 | 99.0 | 100.0 |
| review_scores_accuracy | 2409.0 | 9.552511 | 0.760750 | 2.0 | 9.0 | 10.0 | 10.0 | 10.0 | 751.0 | 9.905459 | 0.314722 | 8.0 | 10.0 | 10.0 | 10.0 | 10.0 |
| review_scores_cleanliness | 2414.0 | 9.454018 | 0.864743 | 3.0 | 9.0 | 10.0 | 10.0 | 10.0 | 751.0 | 9.885486 | 0.365425 | 6.0 | 10.0 | 10.0 | 10.0 | 10.0 |
| review_scores_checkin | 2409.0 | 9.731424 | 0.663235 | 2.0 | 10.0 | 10.0 | 10.0 | 10.0 | 751.0 | 9.964048 | 0.200098 | 8.0 | 10.0 | 10.0 | 10.0 | 10.0 |
| review_scores_communication | 2416.0 | 9.758278 | 0.635341 | 2.0 | 10.0 | 10.0 | 10.0 | 10.0 | 751.0 | 9.974700 | 0.165406 | 8.0 | 10.0 | 10.0 | 10.0 | 10.0 |
| review_scores_location | 2412.0 | 9.564677 | 0.661469 | 4.0 | 9.0 | 10.0 | 10.0 | 10.0 | 751.0 | 9.750999 | 0.485025 | 6.0 | 10.0 | 10.0 | 10.0 | 10.0 |
| review_scores_value | 2411.0 | 9.359602 | 0.795533 | 2.0 | 9.0 | 9.0 | 10.0 | 10.0 | 751.0 | 9.749667 | 0.471787 | 6.0 | 10.0 | 10.0 | 10.0 | 10.0 |
# Initialise plotly's offline notebook mode.
import plotly.offline

plotly.offline.init_notebook_mode()